*This file takes the monthly panel dataset prepared by 2.regressions.do and refines the sample to the primary analysis sample. It also defines the variables which are needed in the regression analysis, converting the panel into a single observation per child.

********************************************************************************
*DEFINE DIRECTORIES
*Path macros for this do-file. In the lines visible here only home and main are
*used (by the cd commands further down); the remaining macros are kept as-is.
local home    "CHILD"
local main    "CHILD/JPE"
local logs    "CHILD/JPE/logs"
local data    "CHILD/JPE/data"
local results "CHILD/JPE/results"
local network "NETWORK"
********************************************************************************

*load the starting dataset from the main directory
cd "`main'"
use "final_24after_3mon.dta", clear

*attach the zip-year crosswalk (read from the home directory);
*crosswalk rows with no matching observation in memory are not kept
rename zcta_kid zip
cd "`home'"
merge m:1 zip year using "zip_county_crosswalk_insurer.dta", keep(master match) nogenerate

*county codes set by hand for three zips
replace county = "35039" if zip=="87533"
replace county = "21125" if zip=="40743"
replace county = "25013" if zip=="01086"

*restrict to patientids that also appear in the JPE sample file
merge 1:1 patientid using "jpe_sample.dta", keep(match) nogenerate

*generate treatments
*NOTE(review): the earlier note on this step said month 0 prescriptions are
*excluded, yet i_drug0 is part of the sum below - confirm which is intended.
*drug = 1 when the sum of i_drug0-i_drug3 is positive, 0 otherwise.
*rowtotal() counts a missing month as zero. A plain "+" sum would be missing as
*soon as one month is missing, and Stata evaluates (missing > 0) as true, which
*would flag that child as treated.
tempvar n_drug
egen `n_drug' = rowtotal(i_drug0 i_drug1 i_drug2 i_drug3)
generate drug = `n_drug' > 0
drop `n_drug'

*parse variables
*the monthly outcome variables are placed together after "bad" so that the
*varlist ranges used later in this file (e.g. total_spend0-time_in24) resolve
drop i_drug* hospital_*
order total_spend* mh_spend* i_hospital* i_er* hospital* time_in*, after(bad)

*get rid of outcomes beyond period observed
*macro 1 holds the last month kept (24); months 25 through 77 are dropped
local 1 24
local start = `1' + 1
forvalues m = `start'/77 {
    foreach stem in total_spend mh_spend i_hospital i_er hospital time_in {
        drop `stem'`m'
    }
}

*hard code the zeros
*every variable positioned from total_spend0 through time_in24 gets 0 in place
*of system missing (.)
foreach v of varlist total_spend0-time_in24 {
    replace `v' = 0 if `v' == .
}

*generate 3, 6, and 12 month outcomes
*For each horizon H (months 0 through H):
*  totHm      row sum of total_spend0..total_spendH
*  mhHm       row sum of mh_spend0..mh_spendH
*  hospHm     1 if the sum of the i_er and i_hospital variables is positive
*  hospitalHm row sum of hospital0..hospitalH
*  timeinHm   row sum of time_in0..time_inH
*  erHm       1 if the sum of the i_er variables is positive
*The "+" sums rely on the monthly variables having been zero-filled earlier in
*this file; a missing term would make the comparison with 0 evaluate to true.
egen tot3m = rowtotal(total_spend0-total_spend3)
egen mh3m = rowtotal(mh_spend0-mh_spend3)
generate hosp3m = i_er0+i_er1+i_er2+i_er3+i_hospital0+i_hospital1+i_hospital2+i_hospital3>0
egen hospital3m = rowtotal(hospital0-hospital3)
egen timein3m = rowtotal(time_in0-time_in3)
generate er3m = i_er0+i_er1+i_er2+i_er3>0
*replace timein3m = 1 if timein3m>1

egen tot6m = rowtotal(total_spend0-total_spend6)
egen mh6m = rowtotal(mh_spend0-mh_spend6)
*builds on the 3 month indicator; the variable is spelled out in full (hosp3m)
*so the line does not depend on variable-name abbreviation being enabled
generate hosp6m = hosp3m+i_er4+i_er5+i_er6+i_hospital4+i_hospital5+i_hospital6>0
egen hospital6m = rowtotal(hospital0-hospital6)
egen timein6m = rowtotal(time_in0-time_in6)
generate er6m = er3m + i_er4+i_er5+i_er6 >0
*replace timein6m = 1 if timein6m>1

egen tot12m = rowtotal(total_spend0-total_spend12)
egen mh12m = rowtotal(mh_spend0-mh_spend12)
egen hospital12m = rowtotal(hospital0-hospital12)
*temp1/temp2 are scratch counts used only to build hosp12m
egen temp1 = rowtotal(i_hospital0-i_hospital12)
egen temp2 = rowtotal(i_er0-i_er12)
generate hosp12m= temp1+temp2>0
drop temp1 temp2
egen timein12m = rowtotal(time_in0-time_in12)
generate er12m = er6m + i_er7 + i_er8 + i_er9 + i_er10 + i_er11 + i_er12>0
*replace timein12m = 1 if timein12m>1

*outcomes for 24 month folks
*Same set of outcomes as the shorter horizons, built for months 0-18 and 0-24.
*Only runs when macro 1 equals 24.
if `1'==24 {

    local prev 12
    foreach h of numlist 18 24 {

        egen tot`h'm = rowtotal(total_spend0-total_spend`h')
        egen mh`h'm = rowtotal(mh_spend0-mh_spend`h')

        *scratch counts, used only for the combined indicator
        egen temp1 = rowtotal(i_hospital0-i_hospital`h')
        egen temp2 = rowtotal(i_er0-i_er`h')
        generate hosp`h'm = temp1 + temp2 > 0
        drop temp1 temp2

        egen hospital`h'm = rowtotal(hospital0-hospital`h')
        egen timein`h'm = rowtotal(time_in0-time_in`h')
        *replace timein`h'm = 1 if timein`h'm>1

        *previous horizon's er indicator plus the i_er terms of the added months
        local er_terms er`prev'm
        local first = `prev' + 1
        forvalues m = `first'/`h' {
            local er_terms `er_terms' + i_er`m'
        }
        generate er`h'm = `er_terms' > 0

        local prev `h'
    }

}

*drop underlying variables
*the monthly inputs are removed now that the horizon outcomes exist
foreach stem in total_spend mh_spend i_hospital i_er time_in {
    drop `stem'*
}
drop hospital0-hospital12
if `1'==24 {
    drop hospital13-hospital24
    local outcomes tot3m tot6m tot12m tot18m tot24m mh3m mh6m mh12m mh18m mh24m hosp3m hosp6m hosp12m hosp18m hosp24m timein3m timein6m timein12m timein18m timein24m er3m er6m er12m er18m er24m
}

*log transformation of $$$ variables
*every variable matching tot* or mh* is copied to <name>_orig and then
*overwritten with ln(value + 1)
foreach y of varlist tot* mh* {
    generate `y'_orig = `y'
    replace `y' = ln(`y' + 1)
}

*generate month ids
*calendar month extracted from first_mh
generate month = month(first_mh)

*save regression data
save "regression_data_final.dta", replace

*drop variables
*removes every variable positioned from share_mh through share_gp
drop share_mh-share_gp

*merge in market measures (new!)
*only observations matched on zcta_kid and year are kept
rename zip zcta_kid
merge m:1 zcta_kid year using "market-measures.dta", keep(match) nogenerate

*adjust geography
*keep any zip3 which has at least 150 kids
*if fewer, then hard code to the state
*count = number of observations sharing the same zip3
generate i = 1
bysort zip3: egen count = total(i)

*state = first two characters of county, converted to a number and shifted by 10000
generate state = substr(county,1,2)
destring state, replace
replace state = state + 10000
replace zip3 = state if count < 150

*save regression data
save "regression_data_final_jpe.dta", replace
